Library

library(Rfastp)
library(qckitfastq)
library(plotly)
library(plyr)

Data

folder <- "../01_data"
files <- list.files(path = folder, pattern = ".gz", full.names = T)
names(files) <- gsub(pattern = "\\.fastq.gz", replacement = "", x = basename(files))

Rfastp

Analysis

qc_SE <- function(file, outdir, name, ad1) {
  res <- rfastp(
    read1 = file,
    outputFastq = paste(outdir, name, sep = "/"),
    thread = 4,
    cutMeanQual = 20,
    cutFrontMeanQual = 20,
    cutTailMeanQual = 20,
    cutLowQualTail = T,
    cutLowQualFront = T,
    qualityFiltering = T,
    maxNfilter = 5,
    qualityFilterPhred = 20,
    qualityFilterPercent = 40,
    lengthFiltering = T,
    minReadLength = 15,
    maxIndexMismatch = 1,
    adapterTrimming = TRUE,
    adapterSequenceRead1 = ad1
  )
  # file.remove(paste0(outdir, "/", name, ".html"))
  return(res)
}

res <- parallel::mclapply(setNames(names(files), names(files)), function(x) {
  qc_SE(file = files[[x]], outdir = "output/", name = x, ad1 = "TGGAATTCTCGGGTGCCAAGG")
}, mc.preschedule = F, mc.cores = 16)

json_files <- list.files(path = "output", pattern = "json", full.names = T)
names(json_files) <- gsub(pattern = "\\.json", replacement = "", x = basename(json_files))

trim_files <- list.files(path = "output", pattern = "gz", full.names = T)
names(trim_files) <- gsub(pattern = "\\_R1.fastq.gz", replacement = "", x = basename(trim_files))

Reading and getting information from JSON file

General

#' Make general information data frame from
#'
#' @param json A JSON file with QC information
#'
#' @return A data frame with FASTQ file information
#' @export
#'
#' @examples
make_general_df <- function(json) {
  js <- rjson::fromJSON(file = json)

  # pkg <- as.character(packageVersion("Rfastp"))
  seq <- js$read1_before_filtering$total_cycles
  seq_type <- ifelse(test = any(grepl(pattern = "read2", x = names(js))), yes = "paired-end", no = "single-end")
  cycle <- paste0(seq_type, " (", seq, " cycles)")

  mean_len_bf <- js$summary$before_filtering$read1_mean_length
  mean_len_af <- js$summary$after_filtering$read1_mean_length
  dup_rate <- ifelse(test = seq_type == "paired-end",
    yes = paste0(round(x = (js$duplication$rate * 100), digits = 2), "%"),
    no = paste0(
      round(x = (js$duplication$rate * 100), digits = 2),
      "% (may be overestimated since this is SE data)"
    )
  )

  df <- data.frame(
    test = c(
      # "Rfastp version",
      "Sequencing",
      "Mean length before filtering",
      "Mean length after filtering",
      "Duplication rate"
    ),
    value = c(
      # pkg,
      cycle,
      mean_len_bf,
      mean_len_af,
      dup_rate
    )
  )
  return(df)
}

parallel::mclapply(setNames(names(json_files), names(json_files)), function(x) {
  knitr::kable(make_general_df(json = json_files[[x]]), col.names = NULL)
}, mc.preschedule = F, mc.cores = 4)

$SRR13129036

Sequencing single-end (43 cycles)
Mean length before filtering 31
Mean length after filtering 30
Duplication rate 8.93% (may be overestimated since this is SE data)

$SRR13129037

Sequencing single-end (43 cycles)
Mean length before filtering 30
Mean length after filtering 29
Duplication rate 5.89% (may be overestimated since this is SE data)

$SRR13129038

Sequencing single-end (43 cycles)
Mean length before filtering 27
Mean length after filtering 27
Duplication rate 4.62% (may be overestimated since this is SE data)

$SRR13129039

Sequencing single-end (43 cycles)
Mean length before filtering 28
Mean length after filtering 28
Duplication rate 5.01% (may be overestimated since this is SE data)

$SRR13129040

Sequencing single-end (43 cycles)
Mean length before filtering 29
Mean length after filtering 28
Duplication rate 5.1% (may be overestimated since this is SE data)

$SRR13129041

Sequencing single-end (43 cycles)
Mean length before filtering 31
Mean length after filtering 28
Duplication rate 8.65% (may be overestimated since this is SE data)

$SRR13129042

Sequencing single-end (43 cycles)
Mean length before filtering 33
Mean length after filtering 30
Duplication rate 10.17% (may be overestimated since this is SE data)

$SRR13129043

Sequencing single-end (43 cycles)
Mean length before filtering 33
Mean length after filtering 30
Duplication rate 11.27% (may be overestimated since this is SE data)

Before QC

make_before_filt_df <- function(json) {
  js <- rjson::fromJSON(file = json)
  tot_reads <- js$read1_before_filtering$total_reads
  tot_bases <- js$read1_before_filtering$total_bases
  q_20_bases <- js$read1_before_filtering$q20_bases
  q_20_rate <- js$summary$before_filtering$q20_rate * 100
  q_30_bases <- js$read1_before_filtering$q30_bases
  q_30_rate <- js$summary$before_filtering$q30_rate * 100
  gc <- paste0(round(x = js$summary$before_filtering$gc_content * 100, digits = 2), "%")

  df <- data.frame(
    test = c(
      "Total reads",
      "Total bases",
      "Q20 bases",
      "Q30 bases",
      "GC content"
    ),
    value = c(
      tot_reads,
      tot_bases,
      paste0(q_20_bases, " (", q_20_rate, "%)"),
      paste0(q_30_bases, " (", q_30_rate, "%)"),
      gc
    )
  )
  return(df)
}


parallel::mclapply(setNames(names(json_files), names(json_files)), function(x) {
  knitr::kable(make_before_filt_df(json = json_files[[x]]), col.names = NULL)
}, mc.preschedule = F, mc.cores = 4)

$SRR13129036

Total reads 11185006
Total bases 357252971
Q20 bases 334375297 (93.5962%)
Q30 bases 314060291 (87.9098%)
GC content 53.47%

$SRR13129037

Total reads 7867867
Total bases 240054600
Q20 bases 225451404 (93.9167%)
Q30 bases 212505540 (88.5238%)
GC content 52.17%

$SRR13129038

Total reads 10194925
Total bases 281230897
Q20 bases 266372556 (94.7167%)
Q30 bases 252189890 (89.6736%)
GC content 52.86%

$SRR13129039

Total reads 7830099
Total bases 224453362
Q20 bases 210404041 (93.7407%)
Q30 bases 197933302 (88.1846%)
GC content 53.33%

$SRR13129040

Total reads 8208659
Total bases 244636315
Q20 bases 230210355 (94.1031%)
Q30 bases 217184942 (88.7787%)
GC content 52.93%

$SRR13129041

Total reads 5001369
Total bases 156293132
Q20 bases 143287472 (91.6787%)
Q30 bases 132726493 (84.9215%)
GC content 50.94%

$SRR13129042

Total reads 4501226
Total bases 148716550
Q20 bases 136504656 (91.7885%)
Q30 bases 126439280 (85.0203%)
GC content 50.99%

$SRR13129043

Total reads 4384159
Total bases 146604195
Q20 bases 134363573 (91.6506%)
Q30 bases 124460441 (84.8956%)
GC content 50.92%

After QC

make_after_filt_df <- function(json) {
  js <- rjson::fromJSON(file = json)
  tot_reads <- js$read1_after_filtering$total_reads
  tot_bases <- js$read1_after_filtering$total_bases
  q_20_bases <- js$read1_after_filtering$q20_bases
  q_20_rate <- js$summary$after_filtering$q20_rate * 100
  q_30_bases <- js$read1_after_filtering$q30_bases
  q_30_rate <- js$summary$after_filtering$q30_rate * 100
  gc <- paste0(round(x = js$summary$after_filtering$gc_content * 100, digits = 2), "%")

  df <- data.frame(
    test = c(
      "Total reads",
      "Total bases",
      "Q20 bases",
      "Q30 bases",
      "GC content"
    ),
    value = c(
      tot_reads,
      tot_bases,
      paste0(q_20_bases, " (", q_20_rate, "%)"),
      paste0(q_30_bases, " (", q_30_rate, "%)"),
      gc
    )
  )
  return(df)
}


parallel::mclapply(setNames(names(json_files), names(json_files)), function(x) {
  knitr::kable(make_after_filt_df(json = json_files[[x]]), col.names = NULL)
}, mc.preschedule = F, mc.cores = 4)

$SRR13129036

Total reads 10827559
Total bases 329736216
Q20 bases 314144334 (95.2714%)
Q30 bases 297339513 (90.175%)
GC content 53.6%

$SRR13129037

Total reads 7450311
Total bases 216714804
Q20 bases 207087385 (95.5576%)
Q30 bases 196728119 (90.7774%)
GC content 52.16%

$SRR13129038

Total reads 9452835
Total bases 261140391
Q20 bases 250299658 (95.8487%)
Q30 bases 238272627 (91.2431%)
GC content 52.65%

$SRR13129039

Total reads 7176073
Total bases 202756457
Q20 bases 193614861 (95.4913%)
Q30 bases 183785082 (90.6433%)
GC content 53.24%

$SRR13129040

Total reads 7816079
Total bases 225710316
Q20 bases 215794836 (95.607%)
Q30 bases 205089922 (90.8642%)
GC content 52.91%

$SRR13129041

Total reads 4737687
Total bases 136251428
Q20 bases 128073054 (93.9976%)
Q30 bases 120073758 (88.1266%)
GC content 50.98%

$SRR13129042

Total reads 4297238
Total bases 131971658
Q20 bases 124260094 (94.1567%)
Q30 bases 116608772 (88.359%)
GC content 51%

$SRR13129043

Total reads 4197599
Total bases 129431304
Q20 bases 121890795 (94.1741%)
Q30 bases 114301250 (88.3104%)
GC content 50.92%

Filtering result

make_filt_df <- function(json) {
  js <- rjson::fromJSON(file = json)

  reads <- js$summary$before_filtering$total_reads
  read_pass <- js$filtering_result$passed_filter_reads
  read_pass_pct <- round((read_pass / reads) * 100, 2)
  read_low_Q <- js$filtering_result$low_quality_reads
  read_low_Q_pct <- round((read_low_Q / reads) * 100, 2)
  read_with_N <- js$filtering_result$too_many_N_reads
  read_with_N_pct <- round((read_with_N / reads) * 100, 2)
  read_short <- js$filtering_result$too_short_reads
  read_short_pct <- round((read_short / reads) * 100, 2)
  read_long <- js$filtering_result$too_long_reads
  read_long_pct <- round((read_pass / reads) * 100, 2)

  df <- data.frame(
    test = c(
      "Reads passed filters",
      "Reads with low quality",
      "Reads with too many N",
      "Reads too short",
      "Reads too long"
    ),
    value = c(
      paste0(read_pass, " (", read_pass_pct, "%)"),
      paste0(read_low_Q, " (", read_low_Q_pct, "%)"),
      paste0(read_with_N, " (", read_with_N_pct, "%)"),
      paste0(read_short, " (", read_short_pct, "%)"),
      paste0(read_long, " (", read_long_pct, "%)")
    )
  )
  return(df)
}


parallel::mclapply(setNames(names(json_files), names(json_files)), function(x) {
  knitr::kable(make_filt_df(json = json_files[[x]]), col.names = NULL)
}, mc.preschedule = F, mc.cores = 4)

$SRR13129036

Reads passed filters 10827559 (96.8%)
Reads with low quality 114349 (1.02%)
Reads with too many N 0 (0%)
Reads too short 243098 (2.17%)
Reads too long 0 (96.8%)

$SRR13129037

Reads passed filters 7450311 (94.69%)
Reads with low quality 71235 (0.91%)
Reads with too many N 0 (0%)
Reads too short 346321 (4.4%)
Reads too long 0 (94.69%)

$SRR13129038

Reads passed filters 9452835 (92.72%)
Reads with low quality 61669 (0.6%)
Reads with too many N 0 (0%)
Reads too short 680421 (6.67%)
Reads too long 0 (92.72%)

$SRR13129039

Reads passed filters 7176073 (91.65%)
Reads with low quality 66473 (0.85%)
Reads with too many N 0 (0%)
Reads too short 587553 (7.5%)
Reads too long 0 (91.65%)

$SRR13129040

Reads passed filters 7816079 (95.22%)
Reads with low quality 65540 (0.8%)
Reads with too many N 0 (0%)
Reads too short 327040 (3.98%)
Reads too long 0 (95.22%)

$SRR13129041

Reads passed filters 4737687 (94.73%)
Reads with low quality 66542 (1.33%)
Reads with too many N 0 (0%)
Reads too short 197140 (3.94%)
Reads too long 0 (94.73%)

$SRR13129042

Reads passed filters 4297238 (95.47%)
Reads with low quality 59790 (1.33%)
Reads with too many N 0 (0%)
Reads too short 144198 (3.2%)
Reads too long 0 (95.47%)

$SRR13129043

Reads passed filters 4197599 (95.74%)
Reads with low quality 66768 (1.52%)
Reads with too many N 0 (0%)
Reads too short 119792 (2.73%)
Reads too long 0 (95.74%)

Duplicated reads plot

make_dup_plot <- function(json) {
  js <- rjson::fromJSON(file = json)

  bars <- js$duplication$histogram
  gc <- js$duplication$mean_gc
  xaxis <- 1:length(bars)

  data <- data.frame(
    Duplicate = round(bars / sum(bars) * 100, 2),
    GC = round(gc * 100, 2),
    x = xaxis
  )
  data$GC[data$GC == 0] <- ""

  p <- plot_ly(data, x = ~x) %>%
    add_trace(
      y = ~Duplicate,
      type = "bar",
      name = "Read percent",
      hoverinfo = "text+x",
      hovertext = paste(
        "<b>", data$Duplicate, "</b>"
      )
    ) %>%
    add_lines(
      y = ~GC,
      name = "Mean GC ratio (%)",
      line = list(color = "red", width = 3),
      hoverinfo = "text+x",
      hovertext = paste(
        "<b>", data$GC, "</b>"
      )
    ) %>%
    layout(
      title = paste0("Duplication rate: ", round(js$duplication$rate * 100, 2), "%"),
      xaxis = list(title = "Duplication level"),
      yaxis = list(title = "Reads percent & GC ratio (%)"),
      hovermode = "x closest"
    )
  print(htmltools::tagList(p))
}

parallel::mclapply(setNames(names(json_files), names(json_files)), function(x) {
  make_dup_plot(json = json_files[[x]])
}, mc.preschedule = F, mc.cores = 4)
$SRR13129036
$SRR13129037
$SRR13129038
$SRR13129039
$SRR13129040
$SRR13129041
$SRR13129042
$SRR13129043

Read quality plot

make_read_qual_plot <- function(json, which = "before") {
  js <- rjson::fromJSON(file = json)

  qual <- NULL
  if (which == "before") {
    qual <- js$read1_before_filtering$quality_curves
  } else {
    qual <- js$read1_after_filtering$quality_curves
  }

  data <- data.frame(
    pos = 1:length(qual$mean),
    Aa = qual$A,
    Tt = qual$T,
    Gg = qual$G,
    Cc = qual$C,
    Mm = qual$mean
  )

  data$Q20 <- 20
  data$Q28 <- 8
  data$Q50 <- 18

  cols <- rcartocolor::carto_pal(n = 10, name = "Safe")[c(2, 1, 3, 4, 10)]

  p <- plot_ly(data, x = ~pos) %>%
    add_trace(
      y = ~Q20, name = "Not good", fillcolor = "rgba(255,0,0, 0.2)", type = "scatter",
      mode = "none", stackgroup = "one", hoverinfo = "skip"
    ) %>%
    add_trace(
      y = ~Q28, name = "OK", fillcolor = "rgba(255,255,0,0.2)", type = "scatter",
      mode = "none", stackgroup = "one", hoverinfo = "skip"
    ) %>%
    add_trace(
      y = ~Q50, name = "Good", fillcolor = "rgba(0,128,0,0.2)", type = "scatter",
      mode = "none", stackgroup = "one", hoverinfo = "skip"
    ) %>%
    add_lines(
      y = ~Aa,
      name = "A",
      line = list(color = cols[1], width = 2),
      hoverinfo = "text",
      hovertext = paste("<b>", data$Aa, "</b>")
    ) %>%
    add_lines(
      y = ~Tt,
      name = "T",
      line = list(color = cols[2], width = 2),
      hoverinfo = "text",
      hovertext = paste("<b>", data$Tt, "</b>")
    ) %>%
    add_lines(
      y = ~Gg,
      name = "G",
      line = list(color = cols[3], width = 2),
      hoverinfo = "text",
      hovertext = paste("<b>", data$Gg, "</b>")
    ) %>%
    add_lines(
      y = ~Cc,
      name = "C",
      line = list(color = cols[4], width = 2),
      hoverinfo = "text",
      hovertext = paste("<b>", data$Cc, "</b>")
    ) %>%
    add_lines(
      y = ~Mm,
      name = "Mean",
      line = list(color = "black", width = 3, dash = "dot"),
      hoverinfo = "text",
      hovertext = paste("<b>", data$Mm, "</b>")
    ) %>%
    layout(
      title = "Per base sequence quality",
      xaxis = list(title = "Position in read (bp)"),
      yaxis = list(title = "Quality score"),
      hovermode = "x closest"
    )

  print(htmltools::tagList(p))
}

Before QC

parallel::mclapply(setNames(names(json_files), names(json_files)), function(x) {
  make_read_qual_plot(json = json_files[[x]], which = "before")
}, mc.preschedule = F, mc.cores = 4)
$SRR13129036
$SRR13129037
$SRR13129038
$SRR13129039
$SRR13129040
$SRR13129041
$SRR13129042
$SRR13129043

After QC

parallel::mclapply(setNames(names(json_files), names(json_files)), function(x) {
  make_read_qual_plot(json = json_files[[x]], which = "after")
}, mc.preschedule = F, mc.cores = 4)
$SRR13129036
$SRR13129037
$SRR13129038
$SRR13129039
$SRR13129040
$SRR13129041
$SRR13129042
$SRR13129043

Base content line plot

make_base_ratio_line_plot <- function(json, which = "before") {
  js <- rjson::fromJSON(file = json)

  qual <- NULL
  if (which == "before") {
    qual <- js$read1_before_filtering$content_curves
  } else {
    qual <- js$read1_after_filtering$content_curves
  }

  data <- data.frame(
    pos = 1:length(qual$A),
    Aa = qual$A * 100,
    Tt = qual$T * 100,
    Gg = qual$G * 100,
    Cc = qual$C * 100,
    GC = qual$GC * 100,
    Nn = qual$N * 100
  )

  cols <- rcartocolor::carto_pal(n = 11, name = "Safe")[c(2, 1, 3, 4, 10)]

  p <- plot_ly(data, x = ~pos) %>%
    add_lines(
      y = ~Aa,
      name = "A",
      line = list(color = cols[1], width = 2),
      hoverinfo = "text+x",
      hovertext = paste("<b>", data$Aa, "</b>")
    ) %>%
    add_lines(
      y = ~Tt,
      name = "T",
      line = list(color = cols[2], width = 2),
      hoverinfo = "text+x",
      hovertext = paste("<b>", data$Tt, "</b>")
    ) %>%
    add_lines(
      y = ~Gg,
      name = "G",
      line = list(color = cols[3], width = 2),
      hoverinfo = "text+x",
      hovertext = paste("<b>", data$Gg, "</b>")
    ) %>%
    add_lines(
      y = ~Cc,
      name = "C",
      line = list(color = cols[4], width = 2),
      hoverinfo = "text+x",
      hovertext = paste("<b>", data$Cc, "</b>")
    ) %>%
    add_lines(
      y = ~GC,
      name = "GC",
      line = list(color = cols[5], width = 3, dash = "dot"),
      hoverinfo = "text+x",
      hovertext = paste("<b>", data$GC, "</b>")
    ) %>%
    add_lines(
      y = ~Nn,
      name = "N",
      line = list(color = "black", width = 3),
      hoverinfo = "text+x",
      hovertext = paste("<b>", data$Nn, "</b>")
    ) %>%
    layout(
      title = "Per base sequence content",
      xaxis = list(title = "Position in read (bp)/ Cycle"),
      yaxis = list(title = "Percentage (%)"),
      hovermode = "x closest"
    )
  print(htmltools::tagList(p))
}

Before QC

parallel::mclapply(setNames(names(json_files), names(json_files)), function(x) {
  make_base_ratio_line_plot(json = json_files[[x]], which = "before")
}, mc.preschedule = F, mc.cores = 4)
$SRR13129036
$SRR13129037
$SRR13129038
$SRR13129039
$SRR13129040
$SRR13129041
$SRR13129042
$SRR13129043

After QC

parallel::mclapply(setNames(names(json_files), names(json_files)), function(x) {
  make_base_ratio_line_plot(json = json_files[[x]], which = "after")
}, mc.preschedule = F, mc.cores = 4)
$SRR13129036
$SRR13129037
$SRR13129038
$SRR13129039
$SRR13129040
$SRR13129041
$SRR13129042
$SRR13129043

Base content ratio proportion plot

make_base_ratio_prop_plot <- function(json, which = "before") {
  js <- rjson::fromJSON(file = json)

  qual <- NULL
  if (which == "before") {
    qual <- js$read1_before_filtering$content_curves
  } else {
    qual <- js$read1_after_filtering$content_curves
  }

  data <- data.frame(
    pos = 1:length(qual$A),
    Aa = qual$A * 100,
    Tt = qual$T * 100,
    Gg = qual$G * 100,
    Cc = qual$C * 100,
    GC = qual$GC * 100,
    Nn = qual$N * 100
  )

  cols <- rcartocolor::carto_pal(n = 11, name = "Safe")[c(2, 1, 3, 4, 10)]

  p <- plot_ly(data, x = ~pos) %>%
    add_trace(
      y = ~Nn, name = "N", fillcolor = "black", type = "scatter",
      mode = "none", stackgroup = "one", hoverinfo = "skip"
    ) %>%
    add_trace(
      y = ~Cc, name = "C", fillcolor = cols[4], type = "scatter",
      mode = "none", stackgroup = "one", hoverinfo = "skip"
    ) %>%
    add_trace(
      y = ~Gg, name = "G", fillcolor = cols[3], type = "scatter",
      mode = "none", stackgroup = "one", hoverinfo = "skip"
    ) %>%
    add_trace(
      y = ~Tt, name = "T", fillcolor = cols[2], type = "scatter",
      mode = "none", stackgroup = "one", hoverinfo = "skip"
    ) %>%
    add_trace(
      y = ~Aa, name = "A", fillcolor = cols[1], type = "scatter",
      mode = "none", stackgroup = "one", hoverinfo = "skip"
    ) %>%
    add_lines(
      y = ~GC,
      name = "GC",
      line = list(color = cols[5], width = 3, dash = "dot"),
      hoverinfo = "text",
      hovertext = paste("<b>", data$GC, "</b>")
    ) %>%
    layout(
      title = "Per base sequence content",
      xaxis = list(title = "Position in read (bp)/ Cycle"),
      yaxis = list(title = "Percentage (%)"),
      hovermode = "x closest"
    )
  print(htmltools::tagList(p))
}

Before QC

parallel::mclapply(setNames(names(json_files), names(json_files)), function(x) {
  make_base_ratio_prop_plot(json = json_files[[x]], which = "before")
}, mc.preschedule = F, mc.cores = 4)
$SRR13129036
$SRR13129037
$SRR13129038
$SRR13129039
$SRR13129040
$SRR13129041
$SRR13129042
$SRR13129043

After QC

parallel::mclapply(setNames(names(json_files), names(json_files)), function(x) {
  make_base_ratio_prop_plot(json = json_files[[x]], which = "after")
}, mc.preschedule = F, mc.cores = 4)
$SRR13129036
$SRR13129037
$SRR13129038
$SRR13129039
$SRR13129040
$SRR13129041
$SRR13129042
$SRR13129043

qckitfastq

Analysis

run_qc <- function(fq) {
  out <- list()

  infile <- fq
  fseq <- seqTools::fastqq(infile)

  ## Read length
  read_len <- read_length(fseq)
  # plot_read_length(read_len)

  ## Per base sequence quality
  bs <- per_base_quality(infile)
  # plot_per_base_quality(bs)

  ## Per read quality
  prq <- per_read_quality(infile)
  # plot_per_read_quality(prq)

  ## GC content
  gc_df <- GC_content(infile)
  # plot_GC_content(gc_df)

  ## Nucleotide read content
  rc <- read_content(fseq)
  # plot_read_content(rc)

  ## Kmer count
  km <- kmer_count(infile, k = 6)

  ## Overrep reads
  # overrep_reads <- overrep_reads(infile)
  # plot_overrep_reads(overrep_reads)

  ## Overrep kmer
  overkm <- overrep_kmer(infile, 7)
  # plot_overrep_kmer(overkm)

  ## Adapter content
  if (.Platform$OS.type != "windows") {
    ac_sorted <- adapter_content(infile)
    # plot_adapter_content(ac_sorted)
  } else {
    ac_sorted <- "adapter_content not available for Windows; skipping"
    print("adapter_content not available for Windows; skipping")
  }

  out <- list(
    read_length = read_len,
    per_base_quality = bs,
    per_read_quality = prq,
    GC_content = gc_df,
    read_content = rc,
    kmer_count = km,
    overrep_kmer = overkm,
    overrep_reads = overrep_reads,
    adapter_content = ac_sorted
  )
  return(out)
}


res1 <- parallel::mclapply(setNames(names(files), names(files)), function(x) {
  run_qc(fq = files[[x]])
}, mc.preschedule = F, mc.cores = 4)


res2 <- parallel::mclapply(setNames(names(trim_files), names(trim_files)), function(x) {
  run_qc(fq = trim_files[[x]])
}, mc.preschedule = F, mc.cores = 4)

Read length distribution

#' Read length distribution plot
#'
#' @param before_trim A dataframe with 2 columns: read_length and num_reads
#' @param after_trim A dataframe with 2 columns: read_length and num_reads
#'
#' @return An interactive barplot
#' @export
#'
#' @examples
read_length_plot <- function(before_trim, after_trim) {
  if (!colnames(before_trim) %in% c("read_length", "num_reads") |
    !colnames(after_trim) %in% c("read_length", "num_reads")) {
    message("Please check the columns of input")
  }
  raw <- before_trim
  trim <- after_trim
  raw$Group <- "Raw"
  trim$Group <- "Trimmed"

  p1 <- plot_ly(hoverinfo = "text", type = "bar", textposition = "auto") %>%
    add_trace(
      data = raw,
      legendgroup = ~Group,
      x = ~read_length, y = ~num_reads,
      name = ~Group,
      # text = ~`%Reads`,
      marker = list(color = "red"),
      hovertext = paste(
        "<b>Length:</b> ", raw$read_length,
        "<br><b>nReads:</b> ", raw$num_reads
        # , "<br><b>%Reads:</b> ", t$`%Reads`
      )
    ) %>%
    layout(
      title = "Histogram of read length distribution",
      xaxis = list(title = "Reads length"),
      yaxis = list(title = "No. of reads"),
      hovermode = "x closest"
    )

  p2 <- plot_ly(hoverinfo = "text", type = "bar", textposition = "auto") %>%
    add_trace(
      data = trim,
      legendgroup = ~Group,
      x = ~read_length, y = ~num_reads,
      name = ~Group,
      # text = ~`%Reads`,
      marker = list(color = "green"),
      hovertext = paste(
        "<b>Length:</b> ", trim$read_length,
        "<br><b>nReads:</b> ", trim$num_reads
        # , "<br><b>%Reads:</b> ", t$`%Reads`
      )
    ) %>%
    layout(
      title = "Histogram of read length distribution",
      xaxis = list(title = "Reads length"),
      yaxis = list(title = "No. of reads"),
      hovermode = "x closest"
    )
  p <- subplot(p1, p2, shareX = T, shareY = T)
  print(htmltools::tagList(p))
}

parallel::mclapply(setNames(names(res1), names(res1)), function(x) {
  read_length_plot(before_trim = res1[[x]]$read_length, after_trim = res2[[x]]$read_length)
}, mc.preschedule = F, mc.cores = 4)
$SRR13129036
$SRR13129037
$SRR13129038
$SRR13129039
$SRR13129040
$SRR13129041
$SRR13129042
$SRR13129043

References

report::cite_packages(session = sessionInfo())

SessionInfo

devtools::session_info() %>%
  details::details()

─ Session info ───────────────────────────────────────────────────────────────
 setting  value                       
 version  R version 4.0.4 (2021-02-15)
 os       Ubuntu 16.04.7 LTS          
 system   x86_64, linux-gnu           
 ui       X11                         
 language (EN)                        
 collate  en_US.UTF-8                 
 ctype    en_US.UTF-8                 
 tz       Europe/Zurich               
 date     2021-09-20                  

─ Packages ───────────────────────────────────────────────────────────────────
 package     * version date       lib source        
 assertthat    0.2.1   2019-03-21 [1] CRAN (R 4.0.4)
 bslib         0.2.5.1 2021-05-18 [1] CRAN (R 4.0.4)
 cachem        1.0.5   2021-05-15 [1] CRAN (R 4.0.4)
 callr         3.7.0   2021-04-20 [1] CRAN (R 4.0.4)
 cli           3.0.1   2021-07-17 [1] CRAN (R 4.0.4)
 clipr         0.7.1   2020-10-08 [1] CRAN (R 4.0.4)
 colorspace    2.0-2   2021-06-24 [1] CRAN (R 4.0.4)
 crayon        1.4.1   2021-02-08 [1] CRAN (R 4.0.4)
 crosstalk     1.1.1   2021-01-12 [1] CRAN (R 4.0.4)
 data.table    1.14.0  2021-02-21 [1] CRAN (R 4.0.4)
 DBI           1.1.1   2021-01-15 [1] CRAN (R 4.0.4)
 desc          1.3.0   2021-03-05 [1] CRAN (R 4.0.4)
 details       0.2.1   2020-01-12 [1] CRAN (R 4.0.4)
 devtools      2.4.2   2021-06-07 [1] CRAN (R 4.0.4)
 digest        0.6.27  2020-10-24 [1] CRAN (R 4.0.4)
 dplyr         1.0.7   2021-06-18 [1] CRAN (R 4.0.4)
 ellipsis      0.3.2   2021-04-29 [1] CRAN (R 4.0.4)
 evaluate      0.14    2019-05-28 [1] CRAN (R 4.0.4)
 fansi         0.5.0   2021-05-25 [1] CRAN (R 4.0.4)
 fastmap       1.1.0   2021-01-25 [1] CRAN (R 4.0.4)
 fs            1.5.0   2020-07-31 [1] CRAN (R 4.0.4)
 generics      0.1.0   2020-10-31 [1] CRAN (R 4.0.4)
 ggplot2     * 3.3.5   2021-06-25 [1] CRAN (R 4.0.4)
 glue          1.4.2   2020-08-27 [1] CRAN (R 4.0.4)
 gtable        0.3.0   2019-03-25 [1] CRAN (R 4.0.4)
 htmltools     0.5.1.1 2021-01-22 [1] CRAN (R 4.0.4)
 htmlwidgets   1.5.3   2020-12-10 [1] CRAN (R 4.0.4)
 httr          1.4.2   2020-07-20 [1] CRAN (R 4.0.4)
 jquerylib     0.1.4   2021-04-26 [1] CRAN (R 4.0.4)
 jsonlite      1.7.2   2020-12-09 [1] CRAN (R 4.0.4)
 knitr         1.33    2021-04-24 [1] CRAN (R 4.0.4)
 lazyeval      0.2.2   2019-03-15 [1] CRAN (R 4.0.4)
 lifecycle     1.0.0   2021-02-15 [1] CRAN (R 4.0.4)
 magrittr      2.0.1   2020-11-17 [1] CRAN (R 4.0.4)
 memoise       2.0.0   2021-01-26 [1] CRAN (R 4.0.4)
 munsell       0.5.0   2018-06-12 [1] CRAN (R 4.0.4)
 pillar        1.6.2   2021-07-29 [1] CRAN (R 4.0.4)
 pkgbuild      1.2.0   2020-12-15 [1] CRAN (R 4.0.4)
 pkgconfig     2.0.3   2019-09-22 [1] CRAN (R 4.0.4)
 pkgload       1.2.1   2021-04-06 [1] CRAN (R 4.0.4)
 plotly      * 4.9.4.1 2021-06-18 [1] CRAN (R 4.0.4)
 plyr        * 1.8.6   2020-03-03 [1] CRAN (R 4.0.4)
 png           0.1-7   2013-12-03 [1] CRAN (R 4.0.4)
 prettyunits   1.1.1   2020-01-24 [1] CRAN (R 4.0.4)
 processx      3.5.2   2021-04-30 [1] CRAN (R 4.0.4)
 ps            1.6.0   2021-02-28 [1] CRAN (R 4.0.4)
 purrr         0.3.4   2020-04-17 [1] CRAN (R 4.0.4)
 qckitfastq  * 1.6.0   2020-10-27 [1] Bioconductor  
 R6            2.5.0   2020-10-28 [1] CRAN (R 4.0.4)
 Rcpp          1.0.7   2021-07-07 [1] CRAN (R 4.0.4)
 remotes       2.4.0   2021-06-02 [1] CRAN (R 4.0.4)
 reshape2      1.4.4   2020-04-09 [1] CRAN (R 4.0.4)
 Rfastp      * 1.0.0   2020-10-27 [1] Bioconductor  
 rjson         0.2.20  2018-06-08 [1] CRAN (R 4.0.4)
 rlang         0.4.11  2021-04-30 [1] CRAN (R 4.0.4)
 rmarkdown     2.9     2021-06-15 [1] CRAN (R 4.0.4)
 rprojroot     2.0.2   2020-11-15 [1] CRAN (R 4.0.4)
 RSeqAn        1.10.0  2020-10-27 [1] Bioconductor  
 sass          0.4.0   2021-05-12 [1] CRAN (R 4.0.4)
 scales        1.1.1   2020-05-11 [1] CRAN (R 4.0.4)
 sessioninfo   1.1.1   2018-11-05 [1] CRAN (R 4.0.4)
 stringi       1.5.3   2020-09-09 [1] CRAN (R 4.0.4)
 stringr       1.4.0   2019-02-10 [1] CRAN (R 4.0.4)
 testthat      3.0.4   2021-07-01 [1] CRAN (R 4.0.4)
 tibble        3.1.3   2021-07-23 [1] CRAN (R 4.0.4)
 tidyr         1.1.3   2021-03-03 [1] CRAN (R 4.0.4)
 tidyselect    1.1.1   2021-04-30 [1] CRAN (R 4.0.4)
 usethis       2.0.1   2021-02-10 [1] CRAN (R 4.0.4)
 utf8          1.2.2   2021-07-24 [1] CRAN (R 4.0.4)
 vctrs         0.3.8   2021-04-29 [1] CRAN (R 4.0.4)
 viridisLite   0.4.0   2021-04-13 [1] CRAN (R 4.0.4)
 withr         2.4.2   2021-04-18 [1] CRAN (R 4.0.4)
 xfun          0.24    2021-06-15 [1] CRAN (R 4.0.4)
 xml2          1.3.2   2020-04-23 [1] CRAN (R 4.0.4)
 yaml          2.2.1   2020-02-01 [1] CRAN (R 4.0.4)
 zlibbioc      1.36.0  2020-10-27 [1] Bioconductor  

[1] /home/ubuntu/R/x86_64-pc-linux-gnu-library/4.0
[2] /usr/local/lib/R/site-library
[3] /usr/lib/R/site-library
[4] /usr/lib/R/library